# Resolve knitr's working directory to the RStudio project root so that
# relative paths (e.g. "cache/") work regardless of where the notebook lives.
knitr::opts_knit$set(root.dir = rprojroot::find_rstudio_root_file())
library(ProjectTemplate)
# ProjectTemplate bootstrap: loads data, helpers, and config from the
# conventional project layout into the global environment.
load.project()

This notebook includes part of the preprocessing and data cleaning necessary for creating a model frame to pass to a function call which will execute the model fitting, including imputing any missing values using information contained in other columns for a given record.

We can start by removing any columns which do not exhibit any variability (which have no utility with respect to fitting a model.)

load(file.path("cache", "loan.RData"))
# Drop zero-variance columns: a column with a single unique value carries no
# information for model fitting. vapply() replaces sapply() for a type-stable
# result.
constant_cols <- names(which(vapply(loan, function(x) length(unique(x)), integer(1)) == 1))
# BUG FIX / guard: when no constant columns exist, `-match(character(0), ...)`
# is `-integer(0)`, and `loan[, -integer(0)]` selects ZERO columns, silently
# dropping the entire data set. Only subset when there is something to drop.
if (length(constant_cols) > 0) {
    loan <- loan[, -match(constant_cols, names(loan))]
}

Date variables

To make use of the date columns, we can create a variable capturing the length of time between the event date and the date that the loan was issued. The credit_tenure_years variable captures the length of time that the loan applicant has been a customer with the lending company, and weeks_btwn_credit_pull_loan_issue captures the amount of time between the most recent credit pull of the loan applicant and the date that the loan was issued. Negative values indicate that the last credit pull was after the loan was issued, and positive values indicate that the most recent credit pull was prior to loan origination.

# Convert the raw date columns into elapsed-time covariates, then drop the
# dates themselves:
#   - cred_tenure_years: years from earliest credit line to loan issue
#   - weeks_btwn_credit_pull_loan_issue: weeks from last credit pull to issue
#     (negative = pull happened after issue)
# BUG FIX: the original divided by 365.5; the average length of a year is
# 365.25 days.
loan <- loan %>%
    mutate(cred_tenure_years = as.numeric(issue_d - earliest_cr_line)/365.25,
        weeks_btwn_credit_pull_loan_issue = as.numeric(issue_d - last_credit_pull_d)/7) %>%
    dplyr::select(-c(last_credit_pull_d, issue_d, earliest_cr_line))

Character variables

The term column has whitespace on the left side of the entries, so removing any whitespace from all character variables.

loan <- loan %>% mutate_if(is.character, funs(str_trim(., side = "both")))

Let’s take a look at the distribution of the categorical variables:

# Character columns with fewer than 40 distinct values — candidates for
# treatment as categorical predictors. vapply() replaces the original
# sapply() pipeline for type-stable, more readable code.
char_cols <- names(loan)[vapply(loan, is.character, logical(1))]
n_levels <- vapply(char_cols, function(col) length(unique(loan[[col]])), integer(1))
short_char_columns <- names(n_levels)[n_levels < 40]
short_char_columns
##  [1] "application_type"    "emp_length"          "grade"              
##  [4] "home_ownership"      "initial_list_status" "loan_status"        
##  [7] "purpose"             "sub_grade"           "term"               
## [10] "title"               "verification_status"
# Four stacked panels of bar charts covering all low-cardinality categorical
# columns: three rows each faceting a group of related columns, plus a
# full-width bar chart of addr_state at the bottom.
grid.arrange(loan %>% dplyr::select(loan_status, purpose, verification_status) %>% 
    gather(key = column, value = value) %>% ggplot(., aes(x = value)) + geom_bar() + 
    theme_bw() + facet_wrap(~column, nrow = 1, scales = "free") + theme(axis.text.x = element_text(angle = 35, 
    hjust = 1, vjust = 1, size = 6)) + xlab(""), loan %>% dplyr::select(initial_list_status, 
    grade, sub_grade) %>% gather(key = column, value = value) %>% ggplot(., 
    aes(x = value)) + geom_bar() + theme_bw() + theme(axis.text.x = element_text(angle = 35, 
    hjust = 1, vjust = 1, size = 5)) + facet_wrap(~column, nrow = 1, scales = "free") + 
    xlab(""), loan %>% dplyr::select(application_type, home_ownership, term, 
    emp_length) %>% gather(key = column, value = value) %>% ggplot(., aes(x = value)) + 
    geom_bar() + theme_bw() + facet_wrap(~column, nrow = 1, scales = "free") + 
    theme(axis.text.x = element_text(angle = 35, hjust = 1, vjust = 1)) + xlab(""), 
    loan %>% ggplot(., aes(x = addr_state)) + geom_bar() + theme_bw() + xlab("") + 
        theme(axis.text.x = element_text(angle = 35, hjust = 1, vjust = 1)), 
    ncol = 1, nrow = 4)

There are a few categorical variables which are notably unbalanced - application type, status, purpose, and home ownership. How is the “ANY” home ownership category defined? Given more information, depending on the definition, we may want to remove this category.

If we take a closer look at loan title, though there are a reasonable number of factor levels, there are still some very small categories.

# For each loan title: the proportion of loans in each grade, stacked to 1,
# with the total count N for the title printed vertically above each bar
# (y-limit of 1.17 leaves room for the labels).
loan %>% group_by(title, grade) %>% dplyr::count() %>% ungroup %>% group_by(title) %>% 
    mutate(p = n/sum(n), N = sum(n)) %>% ungroup %>% ggplot(., aes(x = title, 
    y = p)) + geom_bar(aes(color = grade, fill = grade), stat = "identity") + 
    theme_bw() + scale_color_tableau() + scale_fill_tableau() + theme(axis.text.x = element_text(angle = 45, 
    hjust = 1, vjust = 1), panel.grid = element_blank()) + geom_text(aes(x = title, 
    y = 1.01, label = paste0("N = ", N)), angle = 90, size = 2, hjust = 0, vjust = 0.5) + 
    scale_y_continuous(breaks = c(0, 1), limits = c(0, 1.17)) + ylab("")

These can be aggregated and cleaned up:

# Collapse one-off loan titles into the standard categories and map any
# remaining NAs to "Other". The recoded column is dropped and re-added so it
# ends up as the last column, matching the original drop-then-mutate pattern.
cleaned_title <- loan$title
cleaned_title[cleaned_title %in% "Prescription Drug and Medical Costs"] <- "Medical expenses"
cleaned_title[cleaned_title %in% "new kitchen for momma!"] <- "Home improvement"
cleaned_title[cleaned_title %in% c("DebtC", "Credit Card/Auto Repair", "Paying off higher interest cards & auto")] <- "Debt consolidation"
cleaned_title[cleaned_title %in% c("Pay off Lowes Card", "New Baby and New House (CC Consolidate)")] <- "Credit card refinancing"
cleaned_title[cleaned_title %in% c("Trying to come back to reality!", "Student Loan", "Simple Loan Until Contract Is Completed", 
    "SAVE", "odymeds", "considerate", "new day", "Learning and training", "Green loan")] <- "Other"
cleaned_title[is.na(cleaned_title)] <- "Other"
loan$title <- NULL
loan$title <- cleaned_title  # appended as the last column, like mutate()
rm(cleaned_title)

# Re-draw the grade-mix-by-title chart after the aggregation above to verify
# that the tiny categories have been absorbed.
loan %>% group_by(title, grade) %>% dplyr::count() %>% ungroup %>% group_by(title) %>% 
    mutate(p = n/sum(n), N = sum(n)) %>% ungroup %>% ggplot(., aes(x = title, 
    y = p)) + geom_bar(aes(color = grade, fill = grade), stat = "identity") + 
    theme_bw() + scale_color_tableau() + scale_fill_tableau() + theme(axis.text.x = element_text(angle = 45, 
    hjust = 1, vjust = 1), panel.grid = element_blank()) + geom_text(aes(x = title, 
    y = 1.01, label = paste0("N = ", N)), angle = 90, size = 2.7, hjust = 0, 
    vjust = 0.5) + scale_y_continuous(breaks = c(0, 1), limits = c(0, 1.17)) + 
    ylab("")

We see the same issues with loan purpose factor levels, so I have aggregated very small categories again.

# Same grade-mix visualization for loan purpose: stacked proportions per
# purpose, with the total count N labeled above each bar.
loan %>% group_by(purpose, grade) %>% dplyr::count() %>% ungroup %>% group_by(purpose) %>% 
    mutate(p = n/sum(n), N = sum(n)) %>% ungroup %>% ggplot(., aes(purpose, 
    p, fill = grade, color = grade)) + geom_bar(stat = "identity") + theme_bw() + 
    scale_color_tableau() + scale_fill_tableau() + theme(axis.text.x = element_text(angle = 45, 
    hjust = 1, vjust = 1)) + geom_text(aes(x = purpose, y = 1.01, label = paste0("N = ", 
    N)), angle = 90, size = 2.7, hjust = 0, vjust = 0.5, inherit.aes = FALSE) + 
    scale_y_continuous(breaks = c(0, 1), limits = c(0, 1.17))

# Fold the two smallest purpose levels into "other". As with title, the
# column is dropped and re-added so it lands as the last column.
recoded_purpose <- loan$purpose
recoded_purpose[recoded_purpose %in% c("wedding", "educational")] <- "other"
loan$purpose <- NULL
loan$purpose <- recoded_purpose  # appended last, like mutate()
rm(recoded_purpose)

It is possible that we need to predict the grade of a loan having a term length different than 36 and 60 months, so we’ll treat loan term as a numeric variable.

# Encode the loan term as a numeric number of months. Parsing the digits out
# of the string generalizes beyond the two observed levels ("36 months",
# "60 months") — consistent with the stated goal of handling other term
# lengths — whereas the original case_when returned NA for anything else.
# Backward-compatible: both observed levels still map to 36 and 60.
loan <- loan %>%
    mutate(term_length_months = as.numeric(str_extract(term, "[0-9]+"))) %>%
    dplyr::select(-term)

3 of the categorical variables have more than 50 levels (so I didn’t include bar graphs displaying these distributions.)

# Character columns with more than 50 distinct values — too many levels to
# chart as bars. vapply() replaces the original sapply() pipeline for
# type-stable, more readable code.
char_cols <- names(loan)[vapply(loan, is.character, logical(1))]
n_levels <- vapply(char_cols, function(col) length(unique(loan[[col]])), integer(1))
long_char_columns <- names(n_levels)[n_levels > 50]
# Show the exact cardinality of each high-cardinality column.
vapply(loan[, long_char_columns], function(x) length(unique(x)), integer(1))
## emp_title       url  zip_code 
##    114475    421095       914

Each loan has its own distinct url, so it isn’t useful for modeling. Information contained in emp_title may be useful for predicting loan grade, but the entries of that column are very unstructured, with many levels having only a single observation, and the column would need to be cleaned up before adding it as a covariate.

Out of the total 114475 unique observed values of emp_title, 91607 of those values are associated with a single loan application. Many of these should be labelled according to the same category; for example, of the unique observed values, 13391 contain the word ‘manager’. If more time were available, one might investigate using word embedding techniques to make use of this column, but for now, I’ll leave it out of the model.

loan <- loan %>% dplyr::select(-emp_title)

There are 914 unique values of zip_code in the dataset. Effects of zip codes with only a few loans may not even be estimable if enough of them aren’t included in the training data, and we won’t be able to make predictions for loans with zip codes not present in the training data.

loan %>% group_by(zip_code) %>% dplyr::count() %>% ungroup %>% arrange(n)
## # A tibble: 914 x 2
##    zip_code     n
##    <chr>    <int>
##  1 007xx        1
##  2 203xx        1
##  3 205xx        1
##  4 269xx        1
##  5 340xx        1
##  6 343xx        1
##  7 348xx        1
##  8 399xx        1
##  9 502xx        1
## 10 503xx        1
## # ... with 904 more rows

To reduce dimensionality and avoid overfitting, we need to aggregate levels of zip_code having small numbers of observations by combining them into a single category. The value for addr_state will capture all of the variability in loan grade attributable to geography for states having only a single value of zip_code, so these zip codes can be aggregated to a single category with no information loss at all as long as state is included in the model. I’ll create a category for zip codes having fewer than 200 loans associated with them. Why 200? While somewhat arbitrary, 200 observations should be adequate for estimating the probability of loan grade for each of A, B, C, D, E, F, and G. Alternatively, you could view the number of levels of a factor considered for splitting trees in the construction of the random forest as a hyperparameter. The number of levels considered then becomes part of the model selection problem.

# Pool every zip code with fewer than 200 loans into a single "other"
# category; zip codes with n >= 200 are kept as-is. The column is dropped
# and re-added so it ends up as the last column, as elsewhere.
zip_counts <- loan %>% group_by(zip_code) %>% dplyr::count() %>% ungroup %>% 
    filter(n >= 200)
zip_code <- loan$zip_code
zip_code[!(zip_code %in% zip_counts$zip_code)] <- "other"
loan$zip_code <- NULL
loan <- loan %>% mutate(zip_code = zip_code)
# Consistency fix: remove the temporary vector, as the title/purpose
# clean-ups above do.
rm(zip_code)

Numeric variables

The bulk of the janitorial work with the numeric variables in the dataset is comprised of dealing with missing values using some old fashioned common sense if possible, eliminating uninformative or highly correlated variables, and addressing any detectable outliers.

# Count missing values in every numeric column (numeric columns are first
# converted to logical NA-indicators, then summed), widest gaps first.
loan %>%
    mutate_if(is.numeric, is.na) %>%
    summarize_if(is.logical, sum) %>%
    gather(key = variable, value = NA_count) %>%
    arrange(desc(NA_count)) %>%
    kable()
variable NA_count
dti_joint 420586
annual_inc_joint 420584
il_util 402478
mths_since_rcnt_il 400285
all_util 399723
inq_fi 399723
inq_last_12m 399723
max_bal_bc 399723
open_acc_6m 399723
open_il_12m 399723
open_il_24m 399723
open_rv_12m 399723
open_rv_24m 399723
total_bal_il 399723
total_cu_tl 399723
mths_since_last_record 346680
mths_since_recent_bc_dlq 312495
mths_since_last_major_derog 298366
mths_since_recent_revol_delinq 269358
mths_since_last_delinq 203962
mths_since_recent_inq 44599
num_tl_120dpd_2m 19230
mo_sin_old_il_acct 12254
percent_bc_gt_75 4239
bc_util 4227
bc_open_to_buy 3963
mths_since_recent_bc 3798
revol_util 162
weeks_btwn_credit_pull_loan_issue 9
dti 2
num_rev_accts 1
acc_now_delinq 0
acc_open_past_24mths 0
annual_inc 0
avg_cur_bal 0
chargeoff_within_12_mths 0
collection_recovery_fee 0
collections_12_mths_ex_med 0
delinq_2yrs 0
delinq_amnt 0
fico_range_high 0
fico_range_low 0
funded_amnt 0
funded_amnt_inv 0
id 0
inq_last_6mths 0
installment 0
last_fico_range_high 0
last_fico_range_low 0
loan_amnt 0
mort_acc 0
mo_sin_old_rev_tl_op 0
mo_sin_rcnt_rev_tl_op 0
mo_sin_rcnt_tl 0
num_accts_ever_120_pd 0
num_actv_bc_tl 0
num_actv_rev_tl 0
num_bc_sats 0
num_bc_tl 0
num_il_tl 0
num_op_rev_tl 0
num_rev_tl_bal_gt_0 0
num_sats 0
num_tl_30dpd 0
num_tl_90g_dpd_24m 0
num_tl_op_past_12m 0
open_acc 0
out_prncp 0
out_prncp_inv 0
pct_tl_nvr_dlq 0
pub_rec 0
pub_rec_bankruptcies 0
recoveries 0
revol_bal 0
tax_liens 0
total_acc 0
total_bal_ex_mort 0
total_bc_limit 0
total_il_high_credit_limit 0
total_pymnt 0
total_pymnt_inv 0
total_rev_hi_lim 0
tot_coll_amt 0
tot_cur_bal 0
tot_hi_cred_lim 0
cred_tenure_years 0
term_length_months 0

I examined plots of the univariate distributions for degenerate variables and outliers; the column minimum, median, and maximum are indicated by the dotted vertical lines:

gglist <- list()
# Build a histogram for every numeric column (except id and the exclusions
# below), with dotted vertical lines at the column minimum, median, and
# maximum.
for (this_column in names(which(sapply(loan[, -match("id", names(loan))], is.numeric)))) {
    # BUG FIX: the original condition was
    #   this_column != c("policy_code", "term_length_months")
    # which compares a scalar against a length-2 vector; `if` used only the
    # first element (an error in R >= 4.2), so term_length_months was never
    # actually skipped. %in% expresses the intended set exclusion.
    if (!(this_column %in% c("policy_code", "term_length_months"))) {
        gglist <- list.append(gglist, ggplot(loan, aes_string(x = this_column)) + 
            geom_histogram() + theme_bw() + theme(axis.text.x = element_text(angle = 45, 
            hjust = 1, vjust = 1)) + geom_vline(aes(xintercept = min(as.vector(data.frame(loan)[, 
            this_column]), na.rm = TRUE)), alpha = 0.5, linetype = 2) + geom_vline(aes(xintercept = max(as.vector(data.frame(loan)[, 
            this_column]), na.rm = TRUE)), alpha = 0.5, linetype = 2) + geom_vline(aes(xintercept = median(as.vector(data.frame(loan)[, 
            this_column]), na.rm = TRUE)), alpha = 0.5, linetype = 2))
    }
}
# Render the histograms 15 per page in a 3 x 5 grid; the remainder goes on a
# smaller final page. (Kept as four top-level calls: knitr auto-prints
# top-level expressions but not expressions inside a loop.)
marrangeGrob(gglist[1:15], nrow = 5, ncol = 3)

marrangeGrob(gglist[16:30], nrow = 5, ncol = 3)

marrangeGrob(gglist[31:45], nrow = 5, ncol = 3)

marrangeGrob(gglist[46:length(gglist)], nrow = 3, ncol = 3)

The histogram for weeks_btwn_credit_pull_loan_issue shows that, for all loans, the most recent credit pull took place after the loan originated. Does this date also correspond to when the loan grade was assigned? If the loan grade was assigned prior to the most recent credit pull, then this variable along with other credit variables collected at the same time should not be used to predict the loan grade. This is something I would verify before proceeding with model fitting.

Additionally, there are some numeric variables with notable high outliers (tax_liens - max of 85, pub_rec - max of 86):

# Joint distribution of tax liens vs. public records, colored by loan grade.
ggplot(loan, aes(x = tax_liens, y = pub_rec, color = grade)) +
    geom_jitter() +
    theme_bw() +
    scale_color_tableau() +
    theme(legend.position = "bottom") +
    guides(color = guide_legend(nrow = 1))

The two observations which are the largest outliers with respect to tax_liens correspond to the same observations which are the largest outliers with respect to pub_rec. The plot below of number of public records versus (number of tax liens + number of publicly recorded bankruptcies) indicates that the types of records that comprise the pub_rec count include both the tax_liens count and the pub_rec_bankruptcies count. What other types of recorded events contribute to this count? From a modeling standpoint, it is preferable to include the marginal counts as covariates rather than an aggregated count.

# pub_rec plotted against the sum of its two known components
# (tax_liens + pub_rec_bankruptcies), colored by loan grade.
ggplot(loan, aes(x = tax_liens + pub_rec_bankruptcies, y = pub_rec, color = grade)) +
    geom_jitter() +
    theme_bw() +
    scale_color_tableau() +
    theme(legend.position = "bottom") +
    guides(color = guide_legend(nrow = 1))

The “joint” variables are missing for non-joint applications, so in order to use them, we need to impute the most sensical value. For individual applications, we’ll set the joint income and joint dti to the individual income and dti of the applicant. It is likely that, since there are so few joint applications, joint dti and dti will be highly correlated, as will annual income and joint annual income, but this information may be useful for predicting the grade for the small number of loans with joint applications.

# Populate the "joint" fields for individual applications by falling back to
# the applicant's own income / dti; joint applications keep their joint
# values. Any other application type would yield NA (case_when default).
# The raw annual_inc_joint column is dropped afterwards.
loan <- loan %>%
    mutate(
        annual_income_joint = case_when(
            application_type == "Joint App" ~ annual_inc_joint,
            application_type == "Individual" ~ annual_inc
        ),
        dti_joint = case_when(
            application_type == "Joint App" ~ dti_joint,
            application_type == "Individual" ~ dti
        )
    ) %>%
    dplyr::select(-annual_inc_joint)

The last_fico_range_low, last_fico_range_high, fico_range_low, and fico_range_high variables are two pairs of numeric columns that designate the bucket into which the applicant’s FICO score falls (38 observed buckets for the fico_range pair).

# One row per observed (low, high) FICO bucket, with its count and width.
loan %>% dplyr::count(fico_range_low, fico_range_high) %>%
    arrange(fico_range_low, fico_range_high) %>%
    mutate(fico_range = fico_range_high - fico_range_low) %>%
    print(n = Inf)
## # A tibble: 38 x 4
##    fico_range_low fico_range_high     n fico_range
##             <int>           <int> <int>      <int>
##  1            660             664 41324          4
##  2            665             669 39245          4
##  3            670             674 39315          4
##  4            675             679 34479          4
##  5            680             684 33491          4
##  6            685             689 28730          4
##  7            690             694 27897          4
##  8            695             699 24869          4
##  9            700             704 22558          4
## 10            705             709 20530          4
## 11            710             714 17645          4
## 12            715             719 15567          4
## 13            720             724 13002          4
## 14            725             729 10151          4
## 15            730             734  8706          4
## 16            735             739  6709          4
## 17            740             744  5793          4
## 18            745             749  4619          4
## 19            750             754  4100          4
## 20            755             759  3482          4
## 21            760             764  2780          4
## 22            765             769  2610          4
## 23            770             774  2113          4
## 24            775             779  2065          4
## 25            780             784  1781          4
## 26            785             789  1455          4
## 27            790             794  1247          4
## 28            795             799  1065          4
## 29            800             804   917          4
## 30            805             809   767          4
## 31            810             814   611          4
## 32            815             819   474          4
## 33            820             824   359          4
## 34            825             829   264          4
## 35            830             834   160          4
## 36            835             839    98          4
## 37            840             844    62          4
## 38            845             850    55          5
# Same bucket summary for the "last" FICO pull (low, high) pairs.
loan %>% dplyr::count(last_fico_range_low, last_fico_range_high) %>%
    arrange(last_fico_range_low, last_fico_range_high) %>%
    mutate(last_fico_range = last_fico_range_high - last_fico_range_low) %>%
    print(n = Inf)
## # A tibble: 72 x 4
##    last_fico_range_low last_fico_range_high     n last_fico_range
##                  <int>                <int> <int>           <int>
##  1                   0                    0    59               0
##  2                   0                  499  8629             499
##  3                 500                  504  1944               4
##  4                 505                  509  2074               4
##  5                 510                  514  2424               4
##  6                 515                  519  2599               4
##  7                 520                  524  2883               4
##  8                 525                  529  2963               4
##  9                 530                  534  3403               4
## 10                 535                  539  3418               4
## 11                 540                  544  3768               4
## 12                 545                  549  3620               4
## 13                 550                  554  3960               4
## 14                 555                  559  3875               4
## 15                 560                  564  3943               4
## 16                 565                  569  3688               4
## 17                 570                  574  3893               4
## 18                 575                  579  3558               4
## 19                 580                  584  3824               4
## 20                 585                  589  3437               4
## 21                 590                  594  3688               4
## 22                 595                  599  3527               4
## 23                 600                  604  3599               4
## 24                 605                  609  3537               4
## 25                 610                  614  3697               4
## 26                 615                  619  3596               4
## 27                 620                  624  4040               4
## 28                 625                  629  4004               4
## 29                 630                  634  4714               4
## 30                 635                  639  4789               4
## 31                 640                  644  5543               4
## 32                 645                  649  5962               4
## 33                 650                  654  6831               4
## 34                 655                  659  7944               4
## 35                 660                  664  8810               4
## 36                 665                  669  9930               4
## 37                 670                  674 11754               4
## 38                 675                  679 12207               4
## 39                 680                  684 13864               4
## 40                 685                  689 13773               4
## 41                 690                  694 15151               4
## 42                 695                  699 15068               4
## 43                 700                  704 14805               4
## 44                 705                  709 14896               4
## 45                 710                  714 14166               4
## 46                 715                  719 13908               4
## 47                 720                  724 13423               4
## 48                 725                  729 11715               4
## 49                 730                  734 11526               4
## 50                 735                  739  9701               4
## 51                 740                  744  9336               4
## 52                 745                  749  7919               4
## 53                 750                  754  7596               4
## 54                 755                  759  6874               4
## 55                 760                  764  5789               4
## 56                 765                  769  5944               4
## 57                 770                  774  5115               4
## 58                 775                  779  5030               4
## 59                 780                  784  4583               4
## 60                 785                  789  4072               4
## 61                 790                  794  3931               4
## 62                 795                  799  3296               4
## 63                 800                  804  3201               4
## 64                 805                  809  2699               4
## 65                 810                  814  2221               4
## 66                 815                  819  1820               4
## 67                 820                  824  1301               4
## 68                 825                  829  1038               4
## 69                 830                  834   613               4
## 70                 835                  839   342               4
## 71                 840                  844   174               4
## 72                 845                  850    71               5
loan <- loan %>% dplyr::select(-c(last_fico_range_low, fico_range_low))

Only the high end of the range is necessary to determine which bucket the score belongs to, so I’ll eliminate the range lows from the model frame.

The open_rv_12m, open_rv_24m, open_il_12m, and open_il_24m entries for certain records can be imputed using the number of open accounts of the appropriate type and the number of months since the most recent and oldest account was opened.

# If the most recent account of a given type was opened more than 12 (or 24)
# months ago, a missing "accounts opened in the last 12/24 months" value must
# be 0. which() drops NA positions in the condition, matching the original
# logical-subscript assignment. Columns are dropped and re-added in the same
# order, so they land at the end exactly as mutate() appended them.
rv12 <- loan$open_rv_12m
rv24 <- loan$open_rv_24m
il12 <- loan$open_il_12m
il24 <- loan$open_il_24m
rv12[which(loan$mo_sin_rcnt_rev_tl_op > 12 & is.na(rv12))] <- 0
rv24[which(loan$mo_sin_rcnt_rev_tl_op > 24 & is.na(rv24))] <- 0
il12[which(loan$mths_since_rcnt_il > 12 & is.na(il12))] <- 0
il24[which(loan$mths_since_rcnt_il > 24 & is.na(il24))] <- 0
loan$open_rv_12m <- loan$open_rv_24m <- loan$open_il_24m <- loan$open_il_12m <- NULL
loan <- loan %>% mutate(open_rv_12m = rv12, open_rv_24m = rv24, 
    open_il_12m = il12, open_il_24m = il24)
rm(rv12, rv24, il12, il24)

Accounts for which pct_tl_nvr_dlq == 100 do not have any delinquencies associated with them, so their corresponding entries in the mths_since_last_delinq and mths_since_recent_revol_delinq columns are missing. I’ll impute these with a negative integer to differentiate these accounts from those which have had a delinquency event.

# Applicants with no delinquencies ever (pct_tl_nvr_dlq == 100) get the
# sentinel -100 in the months-since-delinquency columns to distinguish them
# from applicants with an observed delinquency.
never_delinq <- which(loan$pct_tl_nvr_dlq == 100)
loan$mths_since_last_delinq[never_delinq] <- -100
loan$mths_since_recent_revol_delinq[never_delinq] <- -100
rm(never_delinq)

There are other columns with missing values which might be reasonable to impute using common sense and a bit of subject matter knowledge. Further analysis should include investigating how to impute missing values using information in existing columns.

Next, I eliminate any covariates which can be exactly expressed as linear combinations of other variables:

# Find exact linear dependencies among the complete (no-NA) numeric columns.
complete_numeric_cols <- names(which(colSums(is.na(loan[, names(which(sapply(loan, 
    is.numeric)))])) == 0))
complete_numeric_cols <- complete_numeric_cols[complete_numeric_cols != "id"]
linear_combos <- caret::findLinearCombos(as.matrix(loan[, complete_numeric_cols]))
# BUG FIX: the indices returned by findLinearCombos refer to the columns of
# the matrix that was passed in (i.e. complete_numeric_cols), NOT to all
# numeric columns of `loan`. The original indexed
# names(which(sapply(loan, is.numeric))), so the reported names were wrong
# whenever any numeric column contained NAs or preceded "id".
linear_combos$linearCombos %>% lapply(., function(x) {
    complete_numeric_cols[x]
})
## [[1]]
## [1] "funded_amnt" "delinq_2yrs"
# NOTE(review): the two columns dropped below were chosen from the mislabeled
# output above — re-check them against the corrected names before relying on
# this step (funded_amnt ~ loan_amnt is the likely true linear combination,
# given their correlation of ~1 reported later).
loan <- loan %>% dplyr::select(-c(funded_amnt, delinq_2yrs))

and for further dimension reduction, we can eliminate covariates which are highly correlated with other covariates:

# Pairwise-complete Pearson correlations among all numeric columns, drawn as
# a diverging heat map (blue = -1, white = 0, red = +1).
correlationMatrix <- cor(loan[, names(which(sapply(loan, is.numeric)))], use = "pairwise.complete.obs")
melted_corr <- melt(correlationMatrix)
names(melted_corr) <- c("var1", "var2", "R")
ggplot(melted_corr, aes(var1, var2, fill = R)) +
    geom_tile(color = "white") +
    scale_fill_gradient2(low = "blue", high = "red", mid = "white",
        midpoint = 0, limit = c(-1, 1), space = "Lab",
        name = "Pearson\nCorrelation") +
    coord_fixed() +
    xlab("") +
    ylab("") +
    theme_bw() +
    theme(axis.text.x = element_text(angle = 45, vjust = 1, size = 12, hjust = 1)) +
    theme(axis.text.x = element_text(size = 7))

kable(melted_corr %>% filter(R > 0.85 & var1 != var2))
var1 var2 R
annual_income_joint annual_inc 0.9995550
recoveries collection_recovery_fee 0.9910568
dti_joint dti 0.9744357
dti dti_joint 0.9744357
installment funded_amnt_inv 0.9411558
loan_amnt funded_amnt_inv 0.9999945
total_pymnt funded_amnt_inv 0.8561687
total_pymnt_inv funded_amnt_inv 0.8561567
funded_amnt_inv installment 0.9411558
loan_amnt installment 0.9411739
total_pymnt installment 0.8533709
total_pymnt_inv installment 0.8533395
funded_amnt_inv loan_amnt 0.9999945
installment loan_amnt 0.9411739
total_pymnt loan_amnt 0.8561142
total_pymnt_inv loan_amnt 0.8560929
cred_tenure_years mo_sin_old_rev_tl_op 0.9211809
mths_since_recent_revol_delinq mths_since_last_delinq 0.9938262
mths_since_last_delinq mths_since_recent_revol_delinq 0.9938262
num_rev_tl_bal_gt_0 num_actv_rev_tl 0.9808494
num_actv_rev_tl num_rev_tl_bal_gt_0 0.9808494
open_acc num_sats 0.9986025
num_sats open_acc 0.9986025
out_prncp_inv out_prncp 0.9999981
out_prncp out_prncp_inv 0.9999981
collection_recovery_fee recoveries 0.9910568
total_bal_il total_bal_ex_mort 0.9042258
total_il_high_credit_limit total_bal_ex_mort 0.8645172
total_bal_ex_mort total_bal_il 0.9042258
total_il_high_credit_limit total_bal_il 0.9586317
total_bal_ex_mort total_il_high_credit_limit 0.8645172
total_bal_il total_il_high_credit_limit 0.9586317
funded_amnt_inv total_pymnt 0.8561687
installment total_pymnt 0.8533709
loan_amnt total_pymnt 0.8561142
total_pymnt_inv total_pymnt 0.9999957
funded_amnt_inv total_pymnt_inv 0.8561567
installment total_pymnt_inv 0.8533395
loan_amnt total_pymnt_inv 0.8560929
total_pymnt total_pymnt_inv 0.9999957
tot_hi_cred_lim tot_cur_bal 0.9749433
tot_cur_bal tot_hi_cred_lim 0.9749433
mo_sin_old_rev_tl_op cred_tenure_years 0.9211809
annual_inc annual_income_joint 0.9995550
open_rv_24m open_rv_12m 0.8619908
open_rv_12m open_rv_24m 0.8619908

Note: We could spend much more time with exploratory analysis looking thoroughly at both the bivariate relationships between predictors as well as the bivariate relationships between predictors and loan grade, which could inform both variable selection as well as the form of the relationship between predictors and response. I am forgoing an exhaustive analysis for not only the sake of brevity, but because I am choosing to fit a random forest classifier. While the model fitting procedure for random forests does not invoke any automatic variable selection (fitting procedures for certain classes of models can, such as regularized regression-based methods using lasso penalty or variant thereof), these models are extremely flexible and capable of capturing complex, nonlinear relationships between predictors and the response and interaction effects between predictors. Variable importance measures obtained from fitted models can provide more informative means of variable selection. If variable importance measures indicate that a variable is not informative with respect to accurately predicting the response, then the model may be refit and these variables can be excluded.

# Final model frame: drop the near-duplicate / redundant columns identified
# above plus the id, then convert the remaining character columns to factors,
# and cache the result for the modeling step.
# funs() is deprecated since dplyr 0.8; pass the function directly.
# NOTE(review): `url` (421,095 unique values, flagged as useless for modeling
# earlier) is never dropped in this file — confirm it is removed upstream,
# otherwise it becomes an enormous factor here.
loan_mf <- loan %>%
    dplyr::select(-c(total_pymnt_inv, out_prncp_inv, funded_amnt_inv, 
        total_pymnt, installment, avg_cur_bal, id, annual_income_joint)) %>%
    mutate_if(is.character, factor)
cache("loan_mf")